Slip 28

Q.1. Write a Python program to categorize a given news text into one of the 20 available
newsgroup categories, using a multinomial Naïve Bayes machine learning model.

# Import required libraries
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Step 1: Load the 20 Newsgroups dataset (all categories).
# Headers, footers and quoted replies are stripped so the classifier has to
# learn from the message body itself instead of memorising metadata such as
# newsgroup names or sender addresses. This gives a more realistic accuracy
# figure and matches the plain sentences classified later in the script.
newsgroups = fetch_20newsgroups(
    subset='all',
    shuffle=True,
    random_state=42,
    remove=('headers', 'footers', 'quotes'),
)

# Step 2: Split into training (75%) and testing (25%) sets.
# Stratifying on the target keeps the proportion of each of the 20 classes
# the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    newsgroups.data,
    newsgroups.target,
    test_size=0.25,
    random_state=42,
    stratify=newsgroups.target,
)

# Step 3: Build a Pipeline for vectorization + TF-IDF + Multinomial Naive Bayes.
# Wrapping the steps in one Pipeline means raw strings can be passed straight
# to fit() / predict(), and the vocabulary is learned from training data only.
text_clf = Pipeline([
    ('vect', CountVectorizer()),          # Convert text to word counts
    ('tfidf', TfidfTransformer()),        # Re-weight counts with TF-IDF
    ('clf', MultinomialNB()),             # Multinomial Naive Bayes classifier
])

# Step 4: Train the model on the training split
text_clf.fit(X_train, y_train)

# Step 5: Evaluate the trained pipeline on the held-out test split
y_pred = text_clf.predict(X_test)

# Overall accuracy, shown as a percentage rounded to two decimals
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
print("Model Accuracy:", accuracy_pct, "%")

# Per-class metrics, labelled with the newsgroup names instead of indices
print("\nClassification Report:")
report = classification_report(
    y_test,
    y_pred,
    target_names=newsgroups.target_names,
)
print(report)

# Step 6: Classify a handful of hand-written sentences with the trained pipeline
new_texts = [
    "NASA is planning another mission to Mars by next year.",
    "How can I install Linux on my PC?",
    "The new GPU from Nvidia offers excellent gaming performance.",
    "Jesus and the resurrection is discussed heavily in this document.",
    "Baseball teams are preparing for the World Series this season."
]

# One prediction per input text; each is used below as an index into
# newsgroups.target_names to recover the category name.
predicted = text_clf.predict(new_texts)
category_names = newsgroups.target_names

print("\n--- Predictions on New Text ---")
for position, text in enumerate(new_texts):
    category = category_names[predicted[position]]
    print(f"\nText: {text}\nPredicted Category: {category}")

Q.2. Classify the Iris flower dataset using SVM and predict the flower type from the
given input measurements (sepal length, sepal width, petal length and petal width).
Report the accuracy of each SVM kernel.

# Import necessary libraries
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# 1. Load the Iris dataset
iris = datasets.load_iris()

# X holds the features (sepal length, sepal width, petal length, petal width);
# y holds the labels (0 - setosa, 1 - versicolor, 2 - virginica).
X, y = iris.data, iris.target

# 2. Hold out 20% of the samples for testing; the fixed seed makes the
#    split reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# 3. Try all SVM kernels, recording each kernel's test accuracy and keeping
#    the fitted model so the best one can be reused without retraining.
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
accuracies = {}
fitted_models = {}

for kernel in kernels:
    model = SVC(kernel=kernel)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    accuracies[kernel] = acc
    fitted_models[kernel] = model
    print(f"Kernel = {kernel}, Accuracy = {acc:.4f}")

# 4. Predict flower type for a custom input
# Example input: Sepal Length=5.1, Sepal Width=3.5, Petal Length=1.4, Petal Width=0.2
custom_input = [[5.1, 3.5, 1.4, 0.2]]

# Kernel with the highest accuracy; on a tie, max() keeps the first one
# in the order of `kernels`.
best_kernel = max(accuracies, key=accuracies.get)

# Reuse the model already trained in the loop instead of fitting it again.
best_model = fitted_models[best_kernel]
predicted_class = best_model.predict(custom_input)[0]
print(f"\nBest Kernel: {best_kernel}")
print(f"Predicted Flower Type for input {custom_input}: {iris.target_names[predicted_class]}")
